This scripts plots the coverage of 98k random positions, based on mpileup with indels removed. # Autosomal coverage

# cov is a sync file after removind indels
cov <- read.table("~/sex_ratio_EE/results/5.create_sync_file/autosomes/EE_SR_IndelsRm_RepeatsRm_autosomes_subsampled.sync")

# remove N and del (last 2 digits from each cell)
remove_last_digits <- function(x) {
  gsub(".{0,4}$", "", x)
}
cov[4:ncol(cov)] <- lapply(cov[4:ncol(cov)], remove_last_digits)

# add col names
sample_names <- read.table("~/sex_ratio_EE/data/sample_info.txt", sep = "\t", h = T)[,5]
colnames(cov) <- c("LG", "pos", "ref_allele", sample_names)

# sum values within each cell (sum the coverage of all nucleotides)
sum_split_values <- function(x) {
  sapply(strsplit(x, ":"), function(y) sum(as.numeric(y)))
}
  # apply the function
cov <- cov %>%
  mutate(across(contains("males"), sum_split_values))

# change the format to longer
cov_longer_autosomes <- cov %>%
  arrange(LG, pos) %>% # sort the mpileup according to position
  pivot_longer(cols = tidyselect::contains("male"), # change format to longer
    names_to = c("line", "sex", "generation"),
    names_sep = "_",
    values_to = "coverage")%>%
  mutate(line_generation = paste0(line, "_", generation, "_", sex)) 

Calculate mean coverage per line

# calculate mean coverage per line:
cov_longer_autosomes %>%
  group_by(line, generation, sex) %>%
  summarise(mean_coverage = mean(coverage)) 
## `summarise()` has grouped output by 'line', 'generation'. You can override
## using the `.groups` argument.
## # A tibble: 32 × 4
## # Groups:   line, generation [16]
##    line  generation sex     mean_coverage
##    <chr> <chr>      <chr>           <dbl>
##  1 FB-A  F1         females          47.9
##  2 FB-A  F1         males            29.5
##  3 FB-A  F28        females          53.5
##  4 FB-A  F28        males            48.4
##  5 FB-B  F1         females          40.9
##  6 FB-B  F1         males            32.5
##  7 FB-B  F28        females          59.4
##  8 FB-B  F28        males            50.4
##  9 FB-C  F1         females          47.5
## 10 FB-C  F1         males            30.2
## # ℹ 22 more rows

Plot raw coverage of each line

# plot coverage
cov_plot <- ggplot(cov_longer_autosomes, aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,200) +
  #ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  #facet_wrap(~generation) +
  ggtitle("Before summing coverage across sexes; autosomes only")

plot(cov_plot)
## Warning: Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).

# convert to a plotly plot
interactive_cov_plot <- ggplotly(cov_plot)
## Warning: Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot
# plot coverage (only female samples)
cov_longer_autosomes %>%
  filter(sex == "females") %>%
  ggplot(aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,200) +
  #ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  #facet_wrap(~generation) +
  ggtitle("Before summing coverage across sexes; female autosomes only") 
## Warning: Removed 1034 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 1034 rows containing non-finite outside the scale range
## (`stat_density()`).

sum autosomal coverage of males and females

The target coverage for autosomes is 100.98 (vertical line) Therefore, the mean coverage across all lines has to be between 50X and 201X.

# cov_check is a sync file after merging male and female coverage
cov_merged <- read.table("~/sex_ratio_EE/results/5.create_sync_file/autosomes/EE_SR_IndelsRm_RepeatsRm_sexMergedCov_autosomes_subsampled.sync")

# remove N and del (last 2 digits from each cell)
cov_merged[4:ncol(cov_merged)] <- lapply(cov_merged[4:ncol(cov_merged)], remove_last_digits)

# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1"  , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged) <- c("LG", "pos", "ref_allele", sample_names_check)

# sum values within each cell (sum the coverage of all nucleotides)
cov_merged <- cov_merged %>%
  mutate(across(contains("_F"), sum_split_values))

# change the format to longer
cov_merged_longer_autosomes <- cov_merged %>%
  arrange(LG, pos) %>% # sort the mpileup according to position
  pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
    names_to = c("line", "generation"),
    names_sep = "_",
    values_to = "coverage")%>%
  mutate(line_generation = paste0(line, "_", generation)) 

cov_merged_longer_autosomes %>%
  group_by(line, generation) %>%
  summarise(mean_cov = mean(coverage))
## `summarise()` has grouped output by 'line'. You can override using the
## `.groups` argument.
## # A tibble: 16 × 3
## # Groups:   line [8]
##    line  generation mean_cov
##    <chr> <chr>         <dbl>
##  1 FB-A  F1             77.4
##  2 FB-A  F28           102. 
##  3 FB-B  F1             73.1
##  4 FB-B  F28           109. 
##  5 FB-C  F1             77.5
##  6 FB-C  F28            89.1
##  7 FB-D  F1             59.1
##  8 FB-D  F28            92.0
##  9 MB-A  F1             63.5
## 10 MB-A  F28            94.7
## 11 MB-B  F1             59.5
## 12 MB-B  F28           107. 
## 13 MB-C  F1             75.0
## 14 MB-C  F28           106. 
## 15 MB-D  F1             87.4
## 16 MB-D  F28           102.
# plot coverage
plot_cov_merged_longer_autosomes <- ggplot(cov_merged_longer_autosomes, aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,400) +
  #ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  #facet_wrap(~generation) +
  ggtitle("After summing coverage across sexes; autosomes only")+
  geom_vline(xintercept = 100.98)

plot(plot_cov_merged_longer_autosomes)
## Warning: Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).

# convert to a plotly plot
interactive_cov_plot_merged_autosomes <- ggplotly(plot_cov_merged_longer_autosomes)
## Warning: Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_merged_autosomes

Sex chromosome

Read the mpileup file with sex chromosome

# load coverage from the sex chromosome. Note that hash is not used as a comment character!
cov_sex <- read.table("~/sex_ratio_EE/results/5.create_sync_file/sexChr/EE_SR_IndelsRm_RepeatsRm_sexChromosome_subsampled.sync")

# remove N and del (last 2 digits from each cell)
remove_last_digits <- function(x) {
  gsub(".{0,4}$", "", x)
}
cov_sex[4:ncol(cov_sex)] <- lapply(cov_sex[4:ncol(cov_sex)], remove_last_digits)

# add col names
colnames(cov_sex) <- c("LG", "pos", "ref_allele", sample_names)

# sum values within each cell (sum the coverage of all nucleotides)
sum_split_values <- function(x) {
  sapply(strsplit(x, ":"), function(y) sum(as.numeric(y)))
}
  # apply the function
cov_sex <- cov_sex %>%
  mutate(across(contains("males"), sum_split_values))

# subset data and change format to longer
cov_longer_sexChr <- cov_sex %>%
  arrange(LG, pos) %>% # sort the mpileup according to position
  pivot_longer(cols = tidyselect::contains("male"), # change format to longer
    names_to = c("line", "sex", "generation"),
    names_sep = "_",
    values_to = "coverage")%>%
  mutate(line_generation = paste0(line, "_", generation, "_", sex)) %>%
  filter(LG == "LG4") # remove sex chromosome

# calculate mean coverage per line:
cov_longer_sexChr %>%
  group_by(line, generation, sex) %>%
  summarise(mean_coverage = mean(coverage)) 
## `summarise()` has grouped output by 'line', 'generation'. You can override
## using the `.groups` argument.
## # A tibble: 32 × 4
## # Groups:   line, generation [16]
##    line  generation sex     mean_coverage
##    <chr> <chr>      <chr>           <dbl>
##  1 FB-A  F1         females          44.7
##  2 FB-A  F1         males            18.2
##  3 FB-A  F28        females          50.1
##  4 FB-A  F28        males            27.2
##  5 FB-B  F1         females          37.3
##  6 FB-B  F1         males            18.9
##  7 FB-B  F28        females          51.5
##  8 FB-B  F28        males            28.0
##  9 FB-C  F1         females          43.1
## 10 FB-C  F1         males            17.3
## # ℹ 22 more rows

Female coverage of the sex chromosome

Target coverage = 50.41X

# plot female coverage of sex chromosome
cov_plot_sexChr <- cov_longer_sexChr %>%
  filter(sex == "females") %>%
  ggplot(aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,200) +
  ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  ggtitle("Sex chromosome, females") +
  geom_vline(xintercept = 52.4)

plot(cov_plot_sexChr)
## Warning: Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).

# convert to a plotly plot
interactive_cov_plot_sexChr <- ggplotly(cov_plot_sexChr)
## Warning: Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_sexChr

Male coverage of the sex chromosome Tagret coverage = 16.04X

# plot male coverage of sex chromosome
cov_plot_sexChr_males <- cov_longer_sexChr %>%
  filter(sex == "males") %>%
  ggplot(aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,200) +
  ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  ggtitle("Sex chromosome, males") +
  geom_vline(xintercept = 19.6)

plot(cov_plot_sexChr_males)
## Warning: Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).

# convert to a plotly plot
interactive_cov_plot_sexChr_males <- ggplotly(cov_plot_sexChr_males)
## Warning: Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_sexChr_males

check the coverage distribution after male and female coverage on the sex chromosome was summed

The target coverage for the sex chromosome is 77.5X (vertical line) Therefore, the mean coverage across all lines has to be between 38X and 155X.

# cov_check is a sync file after merging male and female coverage
cov_merged_sex <- read.table("~/sex_ratio_EE/results/5.create_sync_file/sexChr/EE_SR_IndelsRm_RepeatsRm_sexMergedCov_sexChromosome_subsampled.sync")

# remove N and del (last 2 digits from each cell)
cov_merged_sex[4:ncol(cov_merged_sex)] <- lapply(cov_merged_sex[4:ncol(cov_merged_sex)], remove_last_digits)

# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1"  , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged_sex) <- c("LG", "pos", "ref_allele", sample_names_check)

# sum values within each cell (sum the coverage of all nucleotides)
cov_merged_sex <- cov_merged_sex %>%
  mutate(across(contains("_F"), sum_split_values))

# change the format to longer
cov_merged_longer_sex <- cov_merged_sex %>%
  arrange(LG, pos) %>% # sort the mpileup according to position
  pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
    names_to = c("line", "generation"),
    names_sep = "_",
    values_to = "coverage")%>%
  mutate(line_generation = paste0(line, "_", generation)) 

# plot coverage
plot_cov_merged_longer_sex <- ggplot(cov_merged_longer_sex, aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,400) +
  #ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  #facet_wrap(~generation) +
  ggtitle("After summing coverage across sexes; sex chromosome only")+
  geom_vline(xintercept = 77.5)

plot(plot_cov_merged_longer_sex)
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).

# convert to a plotly plot
interactive_cov_plot_merged_sex <- ggplotly(plot_cov_merged_longer_sex)
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_merged_sex

coverage distribution after filtering

autosomes

# cov_check is a sync file after merging male and female coverage
cov_merged_covFilt <- read.table("~/sex_ratio_EE/results/6.filter_by_coverage/withoutAdditionalFiltering/autosomes/all_lines//EE_SR_IndelsRm_RepeatsRm_sexMergedCov_autosomes_filteredCov_subsampled.sync")

# remove N and del (last 2 digits from each cell)
cov_merged_covFilt[4:ncol(cov_merged_covFilt)] <- lapply(cov_merged_covFilt[4:ncol(cov_merged_covFilt)], remove_last_digits)

# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1"  , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged_covFilt) <- c("LG", "pos", "ref_allele", sample_names_check)

# sum values within each cell (sum the coverage of all nucleotides)
cov_merged_covFilt <- cov_merged_covFilt %>%
  mutate(across(contains("_F"), sum_split_values))

# change the format to longer
cov_merged_covFilt_longer <- cov_merged_covFilt %>%
  arrange(LG, pos) %>% # sort the mpileup according to position
  pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
    names_to = c("line", "generation"),
    names_sep = "_",
    values_to = "coverage")%>%
  mutate(line_generation = paste0(line, "_", generation)) 

# plot coverage
plot_cov_merged_covFilt_longer <- ggplot(cov_merged_covFilt_longer, aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,400) +
  #ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  #facet_wrap(~generation) +
  ggtitle("After summing coverage across sexes; sex chromosome only", subtitle = "After filtering on coverage")+
  geom_vline(xintercept = 107.24) +
  geom_vline(xintercept = 107.24/2, linetype = "dashed") +
  geom_vline(xintercept = 107.24 * 2, linetype = "dashed")

plot(plot_cov_merged_covFilt_longer)

# convert to a plotly plot
interactive_plot_cov_merged_covFilt_longer <- ggplotly(plot_cov_merged_covFilt_longer)
interactive_plot_cov_merged_covFilt_longer
cov_merged_covFilt_longer %>% 
  group_by(generation, line) %>%
  summarise(mean_coverage_autosomes = mean(coverage))
## `summarise()` has grouped output by 'generation'. You can override using the
## `.groups` argument.
## # A tibble: 16 × 3
## # Groups:   generation [2]
##    generation line  mean_coverage_autosomes
##    <chr>      <chr>                   <dbl>
##  1 F1         FB-A                    106. 
##  2 F1         FB-B                    100. 
##  3 F1         FB-C                    106. 
##  4 F1         FB-D                     81.1
##  5 F1         MB-A                     87.0
##  6 F1         MB-B                     81.3
##  7 F1         MB-C                    102. 
##  8 F1         MB-D                    120. 
##  9 F28        FB-A                    140. 
## 10 F28        FB-B                    151. 
## 11 F28        FB-C                    123. 
## 12 F28        FB-D                    127. 
## 13 F28        MB-A                    131. 
## 14 F28        MB-B                    147. 
## 15 F28        MB-C                    146. 
## 16 F28        MB-D                    141.
pos <- position_jitterdodge(jitter.width = 0.2, dodge.width = 0.5, seed = 2)

plot_cov <- cov_merged_covFilt_longer %>% 
  group_by(generation, line) %>%
  summarise(mean_coverage_autosomes = mean(coverage)) %>%
  mutate(treatment = ifelse(str_detect(line, "^F"), "female-biased", "male_biased")) %>%
  ggplot(aes(generation, mean_coverage_autosomes, color = treatment, label = line)) +
    geom_point(position = pos) +
    ggrepel::geom_label_repel(position = pos) +
  theme_minimal() +
    theme(legend.position = "none") 
## `summarise()` has grouped output by 'generation'. You can override using the
## `.groups` argument.
ggsave("~/sex_ratio_EE/results/5.create_sync_file/mean_coverage_per_line.png", plot = plot_cov, width = 10, height = 7, dpi = 300)

sex chromosome

# cov_check is a sync file after merging male and female coverage

cov_merged_covFilt_sex <- read.table("~/sex_ratio_EE/results/6.filter_by_coverage/withoutAdditionalFiltering/sexChr/all_lines/EE_SR_IndelsRm_RepeatsRm_sexMergedCov_sexChromosome_filteredCov_subsampled.sync")

# remove N and del (last 2 digits from each cell)
cov_merged_covFilt_sex[4:ncol(cov_merged_covFilt_sex)] <- lapply(cov_merged_covFilt_sex[4:ncol(cov_merged_covFilt_sex)], remove_last_digits)

# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1"  , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged_covFilt_sex) <- c("LG", "pos", "ref_allele", sample_names_check)

# sum values within each cell (sum the coverage of all nucleotides)
cov_merged_covFilt_sex <- cov_merged_covFilt_sex %>%
  mutate(across(contains("_F"), sum_split_values))

# change the format to longer
cov_merged_covFilt_sex_longer <- cov_merged_covFilt_sex %>%
  arrange(LG, pos) %>% # sort the mpileup according to position
  pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
    names_to = c("line", "generation"),
    names_sep = "_",
    values_to = "coverage")%>%
  mutate(line_generation = paste0(line, "_", generation)) 

# plot coverage
plot_cov_merged_covFilt_sex_longer <- ggplot(cov_merged_covFilt_sex_longer, aes(coverage)) +
  geom_density(aes(color = line_generation)) +
  geom_density(linewidth = 1.5) +
  xlim(0,400) +
  #ylim(0,0.055) +
  scale_color_viridis(discrete = T) +
  theme(legend.position = "none")+
  #facet_wrap(~generation) +
  ggtitle("After summing coverage across sexes; sex chromosome only", subtitle = "After filtering on coverage")+
  geom_vline(xintercept = 73.58)  +
  geom_vline(xintercept = 73.58/2, linetype = "dashed") +
  geom_vline(xintercept = 73.58 * 2, linetype = "dashed")

plot(plot_cov_merged_covFilt_sex_longer)

# convert to a plotly plot
interactive_plot_cov_merged_covFilt_sex_longer <- ggplotly(plot_cov_merged_covFilt_sex_longer)
interactive_plot_cov_merged_covFilt_sex_longer